# Packages used for Data pre-processing, unsupervised classifications, and plotting
# Basic Library for data pre-processing and handling
import pandas as pd
# Library used to standardize data
from sklearn.preprocessing import MinMaxScaler
# Library for feature selection
from sklearn.feature_selection import VarianceThreshold
# Libraries used for unsupervised classification and clustering
# Hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
# K-means classification
from sklearn.cluster import KMeans
# PCA
from sklearn.decomposition import PCA
# Library for t-SNE projections
from sklearn.manifold import TSNE
# Library for UMAP projections
from umap.umap_ import UMAP
# Library for data visualization
import matplotlib.pyplot as plt
import plotly.express as px
# Load in the medulloblastoma sub-data from PBTA dataset
df = pd.read_csv("mb_pbta_mirna_tdata.csv")
df
| cancer | class | let-7a-2-3p | let-7a-3p | let-7a-5p | let-7b-5p | let-7c-3p | let-7c-5p | let-7d-3p | let-7d-5p | ... | miR-944 | miR-95-3p | miR-95-5p | miR-9-5p | miR-96-3p | miR-96-5p | miR-98-3p | miR-99a-5p | miR-99b-3p | miR-99b-5p | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Medulloblastoma | group4 | 0.000021 | 0.000396 | 0.623749 | 0.235732 | 0.001238 | 0.397037 | 0.002096 | 0.052650 | ... | 0.000000 | 0.001244 | 0.000000 | 0.867301 | 0.000000 | 0.000105 | 0.000003 | 0.467130 | 0.000651 | 0.023314 |
| 1 | Medulloblastoma | shh | 0.000066 | 0.000756 | 0.949854 | 0.459211 | 0.000312 | 0.488999 | 0.017457 | 0.195213 | ... | 0.000005 | 0.008301 | 0.000009 | 0.720100 | 0.000000 | 0.000156 | 0.000009 | 0.143314 | 0.001020 | 0.022673 |
| 2 | Medulloblastoma | shh | 0.000031 | 0.000266 | 0.360419 | 0.182326 | 0.000541 | 0.209765 | 0.009398 | 0.070637 | ... | 0.000009 | 0.007082 | 0.000002 | 0.265383 | 0.000007 | 0.000440 | 0.000012 | 0.128355 | 0.000518 | 0.010669 |
| 3 | Medulloblastoma | unknown | 0.000019 | 0.000116 | 0.232624 | 0.062143 | 0.000333 | 0.127962 | 0.006488 | 0.062143 | ... | 0.000038 | 0.000752 | 0.000013 | 0.078081 | 0.000708 | 0.551996 | 0.000049 | 0.156157 | 0.002838 | 0.086369 |
| 4 | Medulloblastoma | unknown | 0.000002 | 0.000076 | 0.178236 | 0.106488 | 0.000023 | 0.102282 | 0.001322 | 0.023747 | ... | 0.000009 | 0.000453 | 0.000000 | 0.001121 | 0.000000 | 0.000510 | 0.000000 | 0.007941 | 0.000240 | 0.004517 |
| 5 | Medulloblastoma | shh | 0.000144 | 0.000397 | 1.000000 | 0.541517 | 0.000707 | 0.670771 | 0.011183 | 0.117447 | ... | 0.000009 | 0.003454 | 0.000004 | 0.577943 | 0.000013 | 0.000201 | 0.000031 | 0.469522 | 0.001262 | 0.021444 |
| 6 | Medulloblastoma | group4 | 0.000140 | 0.000124 | 0.551274 | 0.232158 | 0.001113 | 0.385674 | 0.002171 | 0.040326 | ... | 0.000000 | 0.000202 | 0.000000 | 0.461546 | 0.000000 | 0.000700 | 0.000010 | 0.466586 | 0.000856 | 0.024345 |
| 7 | Medulloblastoma | group3 | 0.000012 | 0.000062 | 0.251669 | 0.045793 | 0.000146 | 0.093762 | 0.002515 | 0.039105 | ... | 0.000000 | 0.000502 | 0.000000 | 0.077160 | 0.000497 | 0.546387 | 0.000007 | 0.099849 | 0.001061 | 0.035009 |
| 8 | Medulloblastoma | group3 | 0.000037 | 0.000098 | 0.396680 | 0.095476 | 0.001044 | 0.295310 | 0.002314 | 0.034163 | ... | 0.000003 | 0.000138 | 0.000007 | 1.000000 | 0.000007 | 0.004850 | 0.000007 | 0.564924 | 0.001448 | 0.035173 |
| 9 | Medulloblastoma | shh | 0.000108 | 0.000110 | 0.209708 | 0.101648 | 0.000415 | 0.120649 | 0.003664 | 0.037256 | ... | 0.000007 | 0.000319 | 0.000000 | 0.171490 | 0.000000 | 0.000319 | 0.000007 | 0.087714 | 0.000293 | 0.007802 |
| 10 | Medulloblastoma | group3 | 0.000078 | 0.000323 | 0.729035 | 0.350305 | 0.004271 | 0.547715 | 0.011622 | 0.104420 | ... | 0.000000 | 0.000084 | 0.000000 | 0.674608 | 0.000066 | 0.033341 | 0.000030 | 0.620002 | 0.001262 | 0.023962 |
| 11 | Medulloblastoma | group4 | 0.000214 | 0.000421 | 0.556721 | 0.195960 | 0.000789 | 0.318187 | 0.001863 | 0.061527 | ... | 0.000000 | 0.000184 | 0.000000 | 1.000000 | 0.000000 | 0.003510 | 0.000013 | 0.369452 | 0.000489 | 0.019117 |
| 12 | Medulloblastoma | group3 | 0.000002 | 0.000048 | 0.101429 | 0.018495 | 0.000239 | 0.051470 | 0.000691 | 0.009525 | ... | 0.000003 | 0.002518 | 0.000003 | 0.010410 | 0.000812 | 0.520182 | 0.000000 | 0.086535 | 0.000807 | 0.027199 |
| 13 | Medulloblastoma | group4 | 0.000060 | 0.000322 | 0.859749 | 0.279340 | 0.001102 | 0.447632 | 0.003312 | 0.077513 | ... | 0.000009 | 0.001061 | 0.000000 | 0.581369 | 0.000230 | 0.179099 | 0.000014 | 0.471787 | 0.001277 | 0.041977 |
| 14 | Medulloblastoma | group4 | 0.000031 | 0.000118 | 0.551875 | 0.235117 | 0.000961 | 0.357061 | 0.002173 | 0.034875 | ... | 0.000000 | 0.000220 | 0.000000 | 0.705888 | 0.000000 | 0.000109 | 0.000005 | 0.409734 | 0.000509 | 0.020898 |
| 15 | Medulloblastoma | wnt | 0.000424 | 0.000206 | 0.642276 | 0.203788 | 0.000811 | 0.252406 | 0.006881 | 0.098099 | ... | 0.000017 | 0.000337 | 0.000004 | 0.067080 | 0.000111 | 0.057697 | 0.000021 | 0.297908 | 0.001037 | 0.026303 |
| 16 | Medulloblastoma | group4 | 0.000014 | 0.000288 | 0.798152 | 0.245753 | 0.002413 | 0.471158 | 0.004710 | 0.051783 | ... | 0.000000 | 0.001144 | 0.000005 | 0.593621 | 0.000140 | 0.054456 | 0.000009 | 0.671039 | 0.001163 | 0.023850 |
| 17 | Medulloblastoma | group4 | 0.000050 | 0.000261 | 0.781232 | 0.265801 | 0.001648 | 0.465114 | 0.004804 | 0.057230 | ... | 0.000000 | 0.000247 | 0.000000 | 0.535876 | 0.000007 | 0.001047 | 0.000007 | 0.562208 | 0.000901 | 0.024083 |
| 18 | Medulloblastoma | group4 | 0.000137 | 0.000168 | 0.458397 | 0.166863 | 0.000978 | 0.291314 | 0.002366 | 0.035318 | ... | 0.000000 | 0.000254 | 0.000000 | 0.741703 | 0.000000 | 0.000079 | 0.000004 | 0.491265 | 0.000541 | 0.020333 |
| 19 | Medulloblastoma | shh | 0.000124 | 0.000401 | 0.368938 | 0.180769 | 0.000356 | 0.226352 | 0.008084 | 0.077415 | ... | 0.000000 | 0.000601 | 0.000000 | 1.000000 | 0.000000 | 0.000029 | 0.000032 | 0.123235 | 0.000572 | 0.014250 |
| 20 | Medulloblastoma | unknown | 0.000034 | 0.000142 | 0.205612 | 0.037260 | 0.000710 | 0.075110 | 0.003272 | 0.032885 | ... | 0.000017 | 0.000551 | 0.000006 | 0.260882 | 0.000131 | 0.046837 | 0.000011 | 0.102397 | 0.001301 | 0.034053 |
| 21 | Medulloblastoma | shh | 0.000106 | 0.000586 | 0.920766 | 0.397983 | 0.001149 | 0.434060 | 0.015709 | 0.131198 | ... | 0.000010 | 0.007879 | 0.000000 | 0.704137 | 0.000019 | 0.000212 | 0.000014 | 0.204524 | 0.001432 | 0.030212 |
| 22 | Medulloblastoma | wnt | 0.000357 | 0.000120 | 0.493606 | 0.227094 | 0.000473 | 0.263465 | 0.013002 | 0.079173 | ... | 0.000010 | 0.000200 | 0.000007 | 0.063138 | 0.000113 | 0.086307 | 0.000017 | 0.213009 | 0.001227 | 0.032201 |
| 23 | Medulloblastoma | group4 | 0.000006 | 0.000275 | 0.763451 | 0.283993 | 0.001492 | 0.453823 | 0.003575 | 0.053941 | ... | 0.000000 | 0.000644 | 0.000000 | 1.000000 | 0.000000 | 0.001144 | 0.000003 | 0.486575 | 0.000909 | 0.028952 |
| 24 | Medulloblastoma | shh | 0.000182 | 0.000429 | 1.000000 | 0.486279 | 0.000504 | 0.510904 | 0.012095 | 0.117013 | ... | 0.000038 | 0.005361 | 0.000022 | 0.807937 | 0.000011 | 0.000091 | 0.000022 | 0.238661 | 0.001358 | 0.030914 |
| 25 | Medulloblastoma | unknown | 0.000013 | 0.000223 | 0.304406 | 0.035941 | 0.000120 | 0.070146 | 0.002086 | 0.029981 | ... | 0.000004 | 0.000960 | 0.000000 | 0.192141 | 0.000433 | 0.346263 | 0.000009 | 0.059269 | 0.000973 | 0.025020 |
| 26 | Medulloblastoma | group4 | 0.000062 | 0.000240 | 0.675143 | 0.289138 | 0.001190 | 0.464367 | 0.001609 | 0.053420 | ... | 0.000000 | 0.000335 | 0.000011 | 0.545092 | 0.000034 | 0.018969 | 0.000011 | 0.479933 | 0.000726 | 0.017606 |
| 27 | Medulloblastoma | group4 | 0.000040 | 0.000246 | 0.621175 | 0.272721 | 0.000997 | 0.447721 | 0.003115 | 0.066401 | ... | 0.000000 | 0.001653 | 0.000000 | 0.944576 | 0.000000 | 0.002384 | 0.000012 | 0.512581 | 0.001144 | 0.031311 |
| 28 | Medulloblastoma | shh | 0.000428 | 0.000191 | 0.661500 | 0.322800 | 0.000187 | 0.341035 | 0.015713 | 0.099822 | ... | 0.000004 | 0.004808 | 0.000000 | 0.604096 | 0.000054 | 0.061759 | 0.000000 | 0.258560 | 0.002194 | 0.068038 |
| 29 | Medulloblastoma | group3 | 0.000014 | 0.000232 | 0.953281 | 0.273074 | 0.000478 | 0.383432 | 0.003279 | 0.062892 | ... | 0.000018 | 0.000410 | 0.000000 | 1.000000 | 0.000328 | 0.271115 | 0.000009 | 0.273793 | 0.001471 | 0.041413 |
| 30 | Medulloblastoma | unknown | 0.000009 | 0.000143 | 0.599768 | 0.170709 | 0.000507 | 0.367731 | 0.001408 | 0.033312 | ... | 0.000000 | 0.000459 | 0.000000 | 0.422067 | 0.000023 | 0.017893 | 0.000003 | 0.438808 | 0.000616 | 0.021463 |
31 rows × 2085 columns
#df.loc[:,'let-7a-2-3p':]
# K-means classification
kmeans = KMeans(n_clusters=3, random_state=0).fit(df.loc[:,'let-7a-2-3p':])
kmeans.labels_
array([2, 1, 0, 0, 0, 1, 2, 0, 2, 0, 2, 2, 0, 2, 2, 1, 2, 2, 2, 1, 0, 1,
1, 2, 1, 0, 2, 2, 1, 2, 2], dtype=int32)
subtypes = df['class'].tolist()
# Perform PCA
pca2 = PCA(n_components=2)
pca3 = PCA(n_components=3)
df_pca2 = pca2.fit_transform(df.loc[:,'let-7a-2-3p':])
df_pca3 = pca3.fit_transform(df.loc[:,'let-7a-2-3p':])
# Display PCA plot
pca_2d = px.scatter(df_pca2, x=0, y=1, color=subtypes)
# Display 3D PCA plot
pca_3d = px.scatter_3d(df_pca3, x=0, y=1, z=2, color=subtypes)
pca_2d.show()
pca_3d.show()
# Feature Selection using Variance Threshold
# Create function to produce filtered PBTA dataset based on selected Variance Threshold value
def variance_threshold_selector(data, threshold=0.5):
selector = VarianceThreshold(threshold)
selector.fit(data)
return data[data.columns[selector.get_support(indices=True)]]
df_filtered = variance_threshold_selector(df.loc[:,'let-7a-2-3p':], 0.08)
df_filtered
| miR-124-3p | miR-125b-5p | miR-451a | miR-9-5p | |
|---|---|---|---|---|
| 0 | 0.164964 | 1.000000 | 0.044983 | 0.867301 |
| 1 | 0.171694 | 0.224326 | 0.122597 | 0.720100 |
| 2 | 0.186040 | 0.267954 | 1.000000 | 0.265383 |
| 3 | 0.637581 | 0.212198 | 0.203880 | 0.078081 |
| 4 | 0.000498 | 0.010839 | 1.000000 | 0.001121 |
| 5 | 0.085440 | 0.518798 | 0.055473 | 0.577943 |
| 6 | 0.168344 | 0.904995 | 1.000000 | 0.461546 |
| 7 | 1.000000 | 0.171924 | 0.114704 | 0.077160 |
| 8 | 0.098908 | 0.764619 | 0.834979 | 1.000000 |
| 9 | 0.017342 | 0.142732 | 1.000000 | 0.171490 |
| 10 | 0.103978 | 0.643773 | 0.273358 | 0.674608 |
| 11 | 0.097076 | 0.990018 | 0.106567 | 1.000000 |
| 12 | 1.000000 | 0.137437 | 0.088019 | 0.010410 |
| 13 | 0.651566 | 1.000000 | 0.833486 | 0.581369 |
| 14 | 0.136943 | 1.000000 | 0.056372 | 0.705888 |
| 15 | 0.154895 | 0.632354 | 0.110508 | 0.067080 |
| 16 | 0.307301 | 1.000000 | 0.512827 | 0.593621 |
| 17 | 0.307290 | 1.000000 | 0.384100 | 0.535876 |
| 18 | 0.151154 | 1.000000 | 0.146395 | 0.741703 |
| 19 | 0.091491 | 0.173460 | 0.043374 | 1.000000 |
| 20 | 0.043329 | 0.133269 | 0.229709 | 0.260882 |
| 21 | 0.125497 | 0.378697 | 0.727269 | 0.704137 |
| 22 | 0.057784 | 0.543812 | 0.087884 | 0.063138 |
| 23 | 0.229112 | 0.816339 | 0.042371 | 1.000000 |
| 24 | 0.078484 | 0.449800 | 0.086791 | 0.807937 |
| 25 | 1.000000 | 0.137851 | 0.383892 | 0.192141 |
| 26 | 0.360119 | 1.000000 | 0.706348 | 0.545092 |
| 27 | 0.361844 | 1.000000 | 0.618442 | 0.944576 |
| 28 | 0.487439 | 0.750882 | 0.182455 | 0.604096 |
| 29 | 0.579213 | 0.532002 | 0.637072 | 1.000000 |
| 30 | 0.471053 | 1.000000 | 0.161792 | 0.422067 |
# Perform PCA
df_filtered_pca2 = pca2.fit_transform(df_filtered)
df_filtered_pca3 = pca3.fit_transform(df_filtered)
# Display PCA plot based on filtered genes
filtered_pca_2d = px.scatter(df_filtered_pca2, x=0, y=1, color=subtypes)
# Display 3D PCA plot based on filtered genes
filtered_pca_3d = px.scatter_3d(df_filtered_pca3, x=0, y=1, z=2, color=subtypes)
filtered_pca_2d.show()
filtered_pca_3d.show()
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(df.loc[:,'let-7a-2-3p':])
proj_3d = umap_3d.fit_transform(df.loc[:,'let-7a-2-3p':])
fig_2d = px.scatter(proj_2d, x=0, y=1, color=subtypes)
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=subtypes)
fig_3d.update_traces(marker_size=5)
fig_2d.show()
fig_3d.show()
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(df_filtered)
proj_3d = umap_3d.fit_transform(df_filtered)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=subtypes)
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=subtypes)
fig_3d.update_traces(marker_size=5)
fig_2d.show()
fig_3d.show()
# Libraries used for Machine Learning Classification
# Library used to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Library for measuring accuracy of ML models
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Machine Learning model used
from sklearn.ensemble import RandomForestClassifier
# Split the dataset into features and labels
df = df.drop(['cancer'], axis=1)
X = df.loc[:, df.columns != 'class'].values
y = df.loc[:, df.columns == 'class'].values.ravel()
# Sanity check
# print(X[:5])
# print(y[:5])
# Split PBTA data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
#Sanity check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(23, 2083) (8, 2083) (23,) (8,)
# Class Imbalance
fig = px.histogram(df, x="class", nbins=10)
fig.show()
# Initialize random forest classifier
rfc = RandomForestClassifier(max_depth=2, random_state=0)
# Train the random forest classifier
rfc.fit(X_train, y_train)
# Make predictions using random forest classifier
rfc_y_pred = rfc.predict(X_test)
# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, rfc_y_pred)}')
Accuracy: 0.5
# Calculate a confusion matrix
cm = confusion_matrix(y_test, rfc_y_pred, labels=rfc.classes_)
# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(cm, text_auto=True,
labels=dict(x="Subtype", y="Subtype", color="Productivity"),
x=df['class'].unique().tolist(),
y=df['class'].unique().tolist()
)
disp.show()
# What are the most important features?
feature_list = df.columns
feature_list = feature_list.drop('class')
imp_features = pd.Series(rfc.feature_importances_, index=feature_list)
imp_genes = imp_features.sort_values(ascending=False).to_frame().reset_index()
imp_genes.columns = ["features", "importance"]
imp_genes_fil = imp_genes[~(imp_genes == 0.000000).any(axis=1)]
imp_genes_fil
| features | importance | |
|---|---|---|
| 0 | miR-4319 | 0.031400 |
| 1 | miR-887-3p | 0.017159 |
| 2 | miR-6762-3p | 0.016752 |
| 3 | miR-4324 | 0.016084 |
| 4 | miR-548aw | 0.014788 |
| ... | ... | ... |
| 209 | miR-4799-5p | 0.001317 |
| 210 | miR-591 | 0.001294 |
| 211 | miR-20b-5p | 0.001279 |
| 212 | miR-3131 | 0.001183 |
| 213 | miR-4723-3p | 0.001137 |
214 rows × 2 columns
# Display interactive Barplot of important miRNAs
fig = px.bar(imp_genes_fil, x=imp_genes_fil.features, y=imp_genes_fil.importance)
fig.show()
# Make a list of genes that ML model found important
ml_fil_genes = imp_genes_fil['features'].tolist()
# Create a dataframe containing only those genes
ml_fil_df = df.filter(ml_fil_genes)
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_fil_df)
proj_3d = umap_3d.fit_transform(ml_fil_df)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=subtypes)
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=subtypes)
fig_2d.show()
fig_3d.show()
Look for biological explanation of purples being separated, and red varying so much.
# Most impotyant miRNA gene used by ML model to distinguish between MB subtypes
ml_fil_genes[0:1]
['miR-4319']
# Create a dataframe containing only one gene
ml_fil0_df = df.filter(ml_fil_genes[0:1])
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_fil0_df)
proj_3d = umap_3d.fit_transform(ml_fil0_df)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=subtypes)
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=subtypes)
fig_2d.show()
fig_3d.show()
# List of five most important miRNA genes used by ML model
ml_fil_genes[0:5]
['miR-4319', 'miR-887-3p', 'miR-6762-3p', 'miR-4324', 'miR-548aw']
# Create a dataframe containing only five genes
ml_fil5_df = df.filter(ml_fil_genes[0:5])
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_fil5_df)
proj_3d = umap_3d.fit_transform(ml_fil5_df)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=subtypes)
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=subtypes)
fig_2d.show()
fig_3d.show()
# List of top ten genes by ML model
ml_fil_genes[0:10]
['miR-4319', 'miR-887-3p', 'miR-6762-3p', 'miR-4324', 'miR-548aw', 'miR-7111-3p', 'miR-181c-3p', 'miR-340-5p', 'miR-330-5p', 'miR-1298-5p']
# Create a dataframe containing only ten genes
ml_fil10_df = df.filter(ml_fil_genes[0:10])
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_fil10_df)
proj_3d = umap_3d.fit_transform(ml_fil10_df)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=subtypes)
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=subtypes)
fig_2d.show()
fig_3d.show()
# List of top 20 genes from ML model
ml_fil_genes[0:20]
['miR-4319', 'miR-887-3p', 'miR-6762-3p', 'miR-4324', 'miR-548aw', 'miR-7111-3p', 'miR-181c-3p', 'miR-340-5p', 'miR-330-5p', 'miR-1298-5p', 'miR-181a-2-3p', 'let-7a-3p', 'miR-764', 'miR-190b', 'miR-5704', 'miR-139-5p', 'miR-770-5p', 'miR-195-5p', 'miR-455-3p', 'miR-4795-3p']
# Create a dataframe containing only twenty genes
ml_fil20_df = df.filter(ml_fil_genes[0:20])
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_fil20_df)
proj_3d = umap_3d.fit_transform(ml_fil20_df)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=subtypes)
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=subtypes)
fig_2d.show()
fig_3d.show()
# Significant genes found from ML algorithm
ml_fil_genes
['miR-4319', 'miR-887-3p', 'miR-6762-3p', 'miR-4324', 'miR-548aw', 'miR-7111-3p', 'miR-181c-3p', 'miR-340-5p', 'miR-330-5p', 'miR-1298-5p', 'miR-181a-2-3p', 'let-7a-3p', 'miR-764', 'miR-190b', 'miR-5704', 'miR-139-5p', 'miR-770-5p', 'miR-195-5p', 'miR-455-3p', 'miR-4795-3p', 'miR-577', 'miR-3616-3p', 'miR-105-3p', 'miR-676-5p', 'miR-887-5p', 'miR-876-3p', 'miR-125b-5p', 'miR-548x-3p', 'miR-6733-3p', 'miR-130b-3p', 'miR-487a-3p', 'miR-513c-3p', 'let-7c-3p', 'miR-488-3p', 'miR-431-3p', 'miR-378i', 'miR-449b-3p', 'miR-148b-3p', 'miR-6776-3p', 'miR-1298-3p', 'miR-5739', 'miR-33a-3p', 'miR-769-5p', 'miR-767-3p', 'miR-1247-5p', 'miR-455-5p', 'miR-511-5p', 'miR-124-5p', 'miR-96-3p', 'miR-744-3p', 'miR-432-3p', 'miR-125a-3p', 'let-7b-5p', 'miR-2467-3p', 'miR-100-5p', 'miR-3657', 'miR-216b-3p', 'miR-8076', 'miR-378b', 'miR-1910-3p', 'miR-7843-3p', 'miR-548m', 'miR-30b-3p', 'miR-4779', 'miR-4317', 'miR-378h', 'let-7f-5p', 'miR-26b-3p', 'miR-6800-3p', 'miR-204-5p', 'miR-3188', 'let-7f-2-3p', 'miR-195-3p', 'miR-624-5p', 'miR-582-5p', 'miR-409-3p', 'miR-1269a', 'miR-668-5p', 'miR-155-5p', 'miR-411-5p', 'miR-3618', 'miR-548h-5p', 'miR-1229-3p', 'miR-19a-5p', 'miR-891a-5p', 'miR-1271-5p', 'miR-216b-5p', 'miR-4751', 'miR-186-5p', 'miR-4646-3p', 'miR-5010-3p', 'miR-16-1-3p', 'let-7a-2-3p', 'miR-130a-3p', 'miR-6071', 'miR-1226-3p', 'miR-153-5p', 'miR-4283', 'miR-889-3p', 'miR-944', 'miR-383-5p', 'miR-450a-2-3p', 'miR-154-3p', 'miR-26a-5p', 'miR-588', 'miR-6514-5p', 'let-7d-5p', 'miR-6887-3p', 'miR-376c-3p', 'miR-4320', 'miR-6766-5p', 'miR-4705', 'miR-6828-3p', 'miR-1225-3p', 'miR-449c-3p', 'miR-28-5p', 'miR-382-3p', 'miR-17-5p', 'miR-204-3p', 'miR-4330', 'miR-4254', 'miR-4745-3p', 'let-7f-1-3p', 'miR-30c-2-3p', 'miR-4423-5p', 'miR-33a-5p', 'miR-30a-3p', 'miR-5589-3p', 'miR-1178-3p', 'miR-3681-5p', 'miR-7703', 'miR-1249', 'miR-543', 'miR-197-3p', 'miR-8070', 'miR-576-3p', 'miR-6733-5p', 'miR-99a-5p', 'miR-5195-5p', 'miR-412-5p', 'miR-510-5p', 'miR-192-5p', 'miR-374b-3p', 'miR-146a-5p', 'miR-135a-3p', 'miR-346', 'miR-378a-3p', 'miR-3146', 'miR-4680-5p', 'miR-21-3p', 'miR-4473', 'miR-921', 'miR-4662a-3p', 'miR-593-3p', 'miR-892b', 'miR-378e', 'let-7c-5p', 'miR-888-5p', 'miR-216a-5p', 'miR-642a-5p', 'miR-302a-5p', 'miR-873-5p', 'miR-18a-5p', 'miR-377-5p', 'miR-4738-5p', 'miR-182-3p', 'miR-4789-3p', 'miR-4536-5p', 'miR-4709-5p', 'miR-6817-3p', 'miR-153-3p', 'miR-433-5p', 'miR-135b-3p', 'miR-4713-5p', 'miR-670-3p', 'miR-4263', 'miR-671-5p', 'miR-3943', 'miR-20a-5p', 'miR-891a-3p', 'miR-370-5p', 'miR-3176', 'miR-6073', 'miR-548v', 'miR-340-3p', 'miR-3162-5p', 'miR-6789-5p', 'miR-449c-5p', 'miR-141-3p', 'miR-34a-5p', 'miR-3671', 'miR-7114-5p', 'miR-548ad', 'let-7d-3p', 'miR-502-5p', 'miR-376b-3p', 'miR-876-5p', 'miR-6131', 'miR-6813-3p', 'miR-421', 'miR-3191-3p', 'miR-147a', 'miR-1267', 'miR-181d-3p', 'miR-3607-3p', 'miR-134-5p', 'miR-6881-3p', 'miR-8054', 'miR-501-5p', 'miR-4799-5p', 'miR-591', 'miR-20b-5p', 'miR-3131', 'miR-4723-3p']
# Four significant miRNA genes found from variance threshold method
var_threshold_genes = df_filtered.columns.tolist()
var_threshold_genes
['miR-124-3p', 'miR-125b-5p', 'miR-451a', 'miR-9-5p']
# List of overlapping miRNA genes found from variance threshold method and ML algorithm
list(set(ml_fil_genes) & set(var_threshold_genes))
['miR-125b-5p']
LITERATURE SEARCH ON 'miR-125b-5p'
_df = df.copy()
_df.drop(_df[_df["class"] == "unknown"].index, inplace = True)
_df.reset_index(drop=True, inplace=True)
_df
| class | let-7a-2-3p | let-7a-3p | let-7a-5p | let-7b-5p | let-7c-3p | let-7c-5p | let-7d-3p | let-7d-5p | let-7e-3p | ... | miR-944 | miR-95-3p | miR-95-5p | miR-9-5p | miR-96-3p | miR-96-5p | miR-98-3p | miR-99a-5p | miR-99b-3p | miR-99b-5p | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | group4 | 0.000021 | 0.000396 | 0.623749 | 0.235732 | 0.001238 | 0.397037 | 0.002096 | 0.052650 | 0.000543 | ... | 0.000000 | 0.001244 | 0.000000 | 0.867301 | 0.000000 | 0.000105 | 0.000003 | 0.467130 | 0.000651 | 0.023314 |
| 1 | shh | 0.000066 | 0.000756 | 0.949854 | 0.459211 | 0.000312 | 0.488999 | 0.017457 | 0.195213 | 0.001016 | ... | 0.000005 | 0.008301 | 0.000009 | 0.720100 | 0.000000 | 0.000156 | 0.000009 | 0.143314 | 0.001020 | 0.022673 |
| 2 | shh | 0.000031 | 0.000266 | 0.360419 | 0.182326 | 0.000541 | 0.209765 | 0.009398 | 0.070637 | 0.000624 | ... | 0.000009 | 0.007082 | 0.000002 | 0.265383 | 0.000007 | 0.000440 | 0.000012 | 0.128355 | 0.000518 | 0.010669 |
| 3 | shh | 0.000144 | 0.000397 | 1.000000 | 0.541517 | 0.000707 | 0.670771 | 0.011183 | 0.117447 | 0.000838 | ... | 0.000009 | 0.003454 | 0.000004 | 0.577943 | 0.000013 | 0.000201 | 0.000031 | 0.469522 | 0.001262 | 0.021444 |
| 4 | group4 | 0.000140 | 0.000124 | 0.551274 | 0.232158 | 0.001113 | 0.385674 | 0.002171 | 0.040326 | 0.000615 | ... | 0.000000 | 0.000202 | 0.000000 | 0.461546 | 0.000000 | 0.000700 | 0.000010 | 0.466586 | 0.000856 | 0.024345 |
| 5 | group3 | 0.000012 | 0.000062 | 0.251669 | 0.045793 | 0.000146 | 0.093762 | 0.002515 | 0.039105 | 0.000665 | ... | 0.000000 | 0.000502 | 0.000000 | 0.077160 | 0.000497 | 0.546387 | 0.000007 | 0.099849 | 0.001061 | 0.035009 |
| 6 | group3 | 0.000037 | 0.000098 | 0.396680 | 0.095476 | 0.001044 | 0.295310 | 0.002314 | 0.034163 | 0.000384 | ... | 0.000003 | 0.000138 | 0.000007 | 1.000000 | 0.000007 | 0.004850 | 0.000007 | 0.564924 | 0.001448 | 0.035173 |
| 7 | shh | 0.000108 | 0.000110 | 0.209708 | 0.101648 | 0.000415 | 0.120649 | 0.003664 | 0.037256 | 0.000235 | ... | 0.000007 | 0.000319 | 0.000000 | 0.171490 | 0.000000 | 0.000319 | 0.000007 | 0.087714 | 0.000293 | 0.007802 |
| 8 | group3 | 0.000078 | 0.000323 | 0.729035 | 0.350305 | 0.004271 | 0.547715 | 0.011622 | 0.104420 | 0.000580 | ... | 0.000000 | 0.000084 | 0.000000 | 0.674608 | 0.000066 | 0.033341 | 0.000030 | 0.620002 | 0.001262 | 0.023962 |
| 9 | group4 | 0.000214 | 0.000421 | 0.556721 | 0.195960 | 0.000789 | 0.318187 | 0.001863 | 0.061527 | 0.000512 | ... | 0.000000 | 0.000184 | 0.000000 | 1.000000 | 0.000000 | 0.003510 | 0.000013 | 0.369452 | 0.000489 | 0.019117 |
| 10 | group3 | 0.000002 | 0.000048 | 0.101429 | 0.018495 | 0.000239 | 0.051470 | 0.000691 | 0.009525 | 0.000426 | ... | 0.000003 | 0.002518 | 0.000003 | 0.010410 | 0.000812 | 0.520182 | 0.000000 | 0.086535 | 0.000807 | 0.027199 |
| 11 | group4 | 0.000060 | 0.000322 | 0.859749 | 0.279340 | 0.001102 | 0.447632 | 0.003312 | 0.077513 | 0.001277 | ... | 0.000009 | 0.001061 | 0.000000 | 0.581369 | 0.000230 | 0.179099 | 0.000014 | 0.471787 | 0.001277 | 0.041977 |
| 12 | group4 | 0.000031 | 0.000118 | 0.551875 | 0.235117 | 0.000961 | 0.357061 | 0.002173 | 0.034875 | 0.000464 | ... | 0.000000 | 0.000220 | 0.000000 | 0.705888 | 0.000000 | 0.000109 | 0.000005 | 0.409734 | 0.000509 | 0.020898 |
| 13 | wnt | 0.000424 | 0.000206 | 0.642276 | 0.203788 | 0.000811 | 0.252406 | 0.006881 | 0.098099 | 0.000938 | ... | 0.000017 | 0.000337 | 0.000004 | 0.067080 | 0.000111 | 0.057697 | 0.000021 | 0.297908 | 0.001037 | 0.026303 |
| 14 | group4 | 0.000014 | 0.000288 | 0.798152 | 0.245753 | 0.002413 | 0.471158 | 0.004710 | 0.051783 | 0.000735 | ... | 0.000000 | 0.001144 | 0.000005 | 0.593621 | 0.000140 | 0.054456 | 0.000009 | 0.671039 | 0.001163 | 0.023850 |
| 15 | group4 | 0.000050 | 0.000261 | 0.781232 | 0.265801 | 0.001648 | 0.465114 | 0.004804 | 0.057230 | 0.000790 | ... | 0.000000 | 0.000247 | 0.000000 | 0.535876 | 0.000007 | 0.001047 | 0.000007 | 0.562208 | 0.000901 | 0.024083 |
| 16 | group4 | 0.000137 | 0.000168 | 0.458397 | 0.166863 | 0.000978 | 0.291314 | 0.002366 | 0.035318 | 0.000561 | ... | 0.000000 | 0.000254 | 0.000000 | 0.741703 | 0.000000 | 0.000079 | 0.000004 | 0.491265 | 0.000541 | 0.020333 |
| 17 | shh | 0.000124 | 0.000401 | 0.368938 | 0.180769 | 0.000356 | 0.226352 | 0.008084 | 0.077415 | 0.000693 | ... | 0.000000 | 0.000601 | 0.000000 | 1.000000 | 0.000000 | 0.000029 | 0.000032 | 0.123235 | 0.000572 | 0.014250 |
| 18 | shh | 0.000106 | 0.000586 | 0.920766 | 0.397983 | 0.001149 | 0.434060 | 0.015709 | 0.131198 | 0.001221 | ... | 0.000010 | 0.007879 | 0.000000 | 0.704137 | 0.000019 | 0.000212 | 0.000014 | 0.204524 | 0.001432 | 0.030212 |
| 19 | wnt | 0.000357 | 0.000120 | 0.493606 | 0.227094 | 0.000473 | 0.263465 | 0.013002 | 0.079173 | 0.000940 | ... | 0.000010 | 0.000200 | 0.000007 | 0.063138 | 0.000113 | 0.086307 | 0.000017 | 0.213009 | 0.001227 | 0.032201 |
| 20 | group4 | 0.000006 | 0.000275 | 0.763451 | 0.283993 | 0.001492 | 0.453823 | 0.003575 | 0.053941 | 0.000766 | ... | 0.000000 | 0.000644 | 0.000000 | 1.000000 | 0.000000 | 0.001144 | 0.000003 | 0.486575 | 0.000909 | 0.028952 |
| 21 | shh | 0.000182 | 0.000429 | 1.000000 | 0.486279 | 0.000504 | 0.510904 | 0.012095 | 0.117013 | 0.001100 | ... | 0.000038 | 0.005361 | 0.000022 | 0.807937 | 0.000011 | 0.000091 | 0.000022 | 0.238661 | 0.001358 | 0.030914 |
| 22 | group4 | 0.000062 | 0.000240 | 0.675143 | 0.289138 | 0.001190 | 0.464367 | 0.001609 | 0.053420 | 0.000464 | ... | 0.000000 | 0.000335 | 0.000011 | 0.545092 | 0.000034 | 0.018969 | 0.000011 | 0.479933 | 0.000726 | 0.017606 |
| 23 | group4 | 0.000040 | 0.000246 | 0.621175 | 0.272721 | 0.000997 | 0.447721 | 0.003115 | 0.066401 | 0.000739 | ... | 0.000000 | 0.001653 | 0.000000 | 0.944576 | 0.000000 | 0.002384 | 0.000012 | 0.512581 | 0.001144 | 0.031311 |
| 24 | shh | 0.000428 | 0.000191 | 0.661500 | 0.322800 | 0.000187 | 0.341035 | 0.015713 | 0.099822 | 0.001899 | ... | 0.000004 | 0.004808 | 0.000000 | 0.604096 | 0.000054 | 0.061759 | 0.000000 | 0.258560 | 0.002194 | 0.068038 |
| 25 | group3 | 0.000014 | 0.000232 | 0.953281 | 0.273074 | 0.000478 | 0.383432 | 0.003279 | 0.062892 | 0.000856 | ... | 0.000018 | 0.000410 | 0.000000 | 1.000000 | 0.000328 | 0.271115 | 0.000009 | 0.273793 | 0.001471 | 0.041413 |
26 rows × 2084 columns
# Split the dataset into features and labels
_X = _df.loc[:, _df.columns != 'class'].values
_y = _df.loc[:, _df.columns == 'class'].values.ravel()
# Sanity check
print(_X[:5])
print(_y[:5])
[[2.10000000e-05 3.95820000e-04 6.23749194e-01 ... 4.67130456e-01 6.50704000e-04 2.33143920e-02] [6.61000000e-05 7.55926000e-04 9.49853776e-01 ... 1.43314073e-01 1.02050000e-03 2.26730480e-02] [3.06000000e-05 2.65969000e-04 3.60419055e-01 ... 1.28354921e-01 5.17816000e-04 1.06693720e-02] [1.44096000e-04 3.97356000e-04 1.00000000e+00 ... 4.69521514e-01 1.26193200e-03 2.14441040e-02] [1.39986000e-04 1.23708000e-04 5.51273871e-01 ... 4.66585714e-01 8.56192000e-04 2.43445080e-02]] ['group4' 'shh' 'shh' 'shh' 'group4']
# Split data into training and testing set
_X_train, _X_test, _y_train, _y_test = train_test_split(_X, _y, test_size=0.25, random_state=42)
#Sanity check
print(_X_train.shape, _X_test.shape, _y_train.shape, _y_test.shape)
(19, 2083) (7, 2083) (19,) (7,)
# Class Imbalance
fig = px.histogram(_df, x="class", nbins=10)
fig.show()
# Initialize random forest classifier
rfc2 = RandomForestClassifier(max_depth=2, random_state=0)
# Train the random forest classifier
rfc2.fit(_X_train, _y_train)
# Make predictions using random forest classifier
rfc2_y_pred = rfc.predict(_X_test)
# Accuracy of model
print(f'Accuracy: {accuracy_score(_y_test, rfc2_y_pred)}')
Accuracy: 0.8571428571428571
# Calculate a confusion matrix
cm2 = confusion_matrix(_y_test, rfc2_y_pred, labels=rfc2.classes_)
# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(cm2, text_auto=True,
labels=dict(x="True Subtype", y="Predicted Subtype", color="Productivity"),
x=_df['class'].unique().tolist(),
y=_df['class'].unique().tolist()
)
disp.show()
# What are the most important features?
rfc2_feature_list = _df.columns
rfc2_feature_list = rfc2_feature_list.drop('class')
rfc2_imp_features = pd.Series(rfc2.feature_importances_, index=rfc2_feature_list)
rfc2_imp_genes = rfc2_imp_features.sort_values(ascending=False).to_frame().reset_index()
rfc2_imp_genes.columns = ["features", "importance"]
rfc2_imp_genes_fil = rfc2_imp_genes[~(rfc2_imp_genes == 0.000000).any(axis=1)]
rfc2_imp_genes_fil
| features | importance | |
|---|---|---|
| 0 | miR-4324 | 0.019574 |
| 1 | miR-876-3p | 0.017665 |
| 2 | miR-3910 | 0.012948 |
| 3 | miR-378g | 0.012883 |
| 4 | miR-487a-3p | 0.012383 |
| ... | ... | ... |
| 192 | miR-378e | 0.001384 |
| 193 | miR-98-3p | 0.001380 |
| 194 | miR-93-3p | 0.001357 |
| 195 | miR-7162-3p | 0.001335 |
| 196 | miR-6741-5p | 0.001330 |
197 rows × 2 columns
# Display interactive Barplot of important miRNAs
fig = px.bar(rfc2_imp_genes_fil, x=rfc2_imp_genes_fil.features, y=rfc2_imp_genes_fil.importance)
fig.show()
# Make a list of genes that ML model found important
rfc2_fil_genes = rfc2_imp_genes_fil['features'].tolist()
# Create a dataframe containing only those genes
rfc2_fil_df = _df.filter(rfc2_fil_genes)
# Make list of subtype labels
rfc2_subtypes = _df['class'].tolist()
# Plot UMAP
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(rfc2_fil_df)
proj_3d = umap_3d.fit_transform(rfc2_fil_df)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=rfc2_subtypes)
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=rfc2_subtypes)
fig_2d.show()
fig_3d.show()
# Create a dataframe containing only twenty genes
rfc2_fil1_df = _df.filter(rfc2_fil_genes[0:1])
# Plot UMAP
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(rfc2_fil1_df)
proj_3d = umap_3d.fit_transform(rfc2_fil1_df)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=rfc2_subtypes)
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=rfc2_subtypes)
fig_2d.show()
fig_3d.show()
rfc2_fil_genes[0:1]
['miR-4324']
# Perform PCA
rfc2_pca2 = PCA(n_components=2)
rfc2_pca3 = PCA(n_components=3)
_df_pca2 = rfc2_pca2.fit_transform(rfc2_fil_df)
_df_pca3 = rfc2_pca3.fit_transform(rfc2_fil_df)
# Display PCA plot
pca_2d = px.scatter(_df_pca2, x=0, y=1, color=rfc2_subtypes)
# Display 3D PCA plot
pca_3d = px.scatter_3d(_df_pca3, x=0, y=1, z=2, color=rfc2_subtypes)
pca_2d.show()
pca_3d.show()
# Perform PCA
rfc2_pca2 = PCA(n_components=2)
rfc2_pca3 = PCA(n_components=3)
_df_pca2 = rfc2_pca2.fit_transform(_df.filter(rfc2_fil_genes[0:100]))
_df_pca3 = rfc2_pca3.fit_transform(_df.filter(rfc2_fil_genes[0:100]))
# Display PCA plot
pca_2d = px.scatter(_df_pca2, x=0, y=1, color=rfc2_subtypes)
# Display 3D PCA plot
pca_3d = px.scatter_3d(_df_pca3, x=0, y=1, z=2, color=rfc2_subtypes)
pca_2d.show()
pca_3d.show()
miract_df = pd.read_csv("pbta_actmir_analysis/pbta_miRactivity_results.csv")
miract_df
| Kids_First_Biospecimen_ID | sample_id | aliquot_id | RNA_library | short_histology | molecular_subtype | age_at_diagnosis_days | reported_gender | activity_score | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | BS_BHR08WGW | 7316-100 | 601598 | stranded | Craniopharyngioma | CRANIO, ADAM | 2660 | Female | 0.289029 |
| 1 | BS_QV51J756 | 7316-101 | 588338 | stranded | Ganglioglioma | GNG, wildtype | 3472 | Male | 0.285792 |
| 2 | BS_QXKRN6CR | 7316-111 | 470423 | stranded | Ependymoma | EPN, ST RELA | 4519 | Female | 0.000118 |
| 3 | BS_23QW0BBA | 7316-114 | 577714 | stranded | HGAT | HGG, H3 wildtype | 3600 | Female | 0.125015 |
| 4 | BS_DPF1CX0G | 7316-117 | 570116 | stranded | LGAT | LGG, KIAA1549-BRAF | 3354 | Male | 0.051329 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | BS_AP0SZ364 | 7316-946 | 588444 | stranded | LGAT | LGG, other MAPK | 4513 | Male | 0.079672 |
| 240 | BS_86Q14TZG | 7316-949 | 734543 | stranded | Craniopharyngioma | CRANIO, ADAM | 565 | Male | 0.182583 |
| 241 | BS_ZR96A4FH | 7316-952 | 734519 | stranded | GNT | NaN | 1538 | Female | 0.273324 |
| 242 | BS_5YJEPPTP | 7316-954 | 549590 | stranded | LGAT | LGG, KIAA1549-BRAF | 1818 | Female | 0.155737 |
| 243 | BS_T4JFN54G | 7316-957 | 571420 | stranded | LGAT | LGG, other MAPK | 2810 | Female | 0.057152 |
244 rows × 9 columns
mb_miract_df = miract_df[miract_df["short_histology"] == "Medulloblastoma"]
mb_miract_df["activity_score"]
15 0.136787 35 0.317674 39 0.171482 66 0.203215 70 0.257868 76 0.169109 107 0.155369 112 0.245918 114 0.219721 117 0.090931 118 0.189761 122 0.262998 129 0.230809 130 0.124890 147 0.235020 156 0.276188 167 0.216900 178 0.208314 203 0.132491 222 0.151268 226 0.135921 231 0.152771 233 0.087101 236 0.162317 Name: activity_score, dtype: float64
mb_miract_df["activity_score"].count()
24